#### R Code for "Sarracenia pitcher plant-associated microbial communities differ primarily by host species across a longitudinal gradient" 
#### March 17, 2022

#Load packages####
if (!require("vegan")) {install.packages("vegan"); require("vegan")}
if (!require("ggplot2")) {install.packages("ggplot2"); require("ggplot2")}
if (!require("picante")) {install.packages("picante"); require("picante")}
if (!require("nVennR")) {install.packages("nVennR"); require("nVennR")}
if (!require("effects")) {install.packages("effects"); require("effects")}
if (!require("lme4")) {install.packages("lme4"); require("lme4")}
if (!require("reshape2")) {install.packages("reshape2"); require("reshape2")}
if (!require("devtools")) {install.packages("devtools"); require("devtools")}
if (!require("pairwiseAdonis")) {devtools::install_github("pmartinezarbizu/pairwiseAdonis/pairwiseAdonis"); require("pairwiseAdonis")}
if (!require("iNEXT")) {install.packages("iNEXT"); require("iNEXT")}
if (!require("raster")) {install.packages("raster"); require("raster")}
#install phyloseq package from Bioconductor
if (!require("BiocManager")) {install.packages("BiocManager"); require("BiocManager")}
if (!require("phyloseq")) {BiocManager::install("phyloseq"); require("phyloseq")}

# Set working directory ####
#Working directory
setwd("Your_Directory")

#Load data-sets ####

##Sample community data, Format: Samples are rows, ASVs (species) are columns
#16s (bacterial) data
sar16s <- read.csv("16S_asv-table-dada2-GC-Sarracenia.csv", header = T, row.names = 1, check.names = F) 
#18s (eukaryotic) data
sar18s <- read.csv("18S_asv-table-dada2-GC-Sarracenia.csv", header = T, row.names = 1, check.names = F) 

##Metadata, Format: samples are rows, variables are columns
mdat <- read.csv("Metadata-GC-Sarracenia.csv", header = T, row.names = 1)

##Taxonomy tables
sar16s_tax <- read.csv("16S-Taxonomy-GC-Sarracenia.csv", header= T, row.names = 1)
sar18s_tax <- read.csv("18S-Taxonomy-GC-Sarracenia.csv", header= T, row.names = 1)

#Check to see if aligned
row.names(sar16s) == row.names(sar18s)
row.names(mdat) == row.names(sar18s)

#Summarize datasets
summary(rowSums(sar16s))
summary(colSums(sar16s))
summary(rowSums(sar18s))
summary(colSums(sar18s))

#Remove non-target ASVs 
sar16s <- sar16s[,!(names(sar16s) %in% "11fff30dfa1880ff5b3d098f8361c90c")] ## Fungal mitochondria
sar16s <- sar16s[,!(names(sar16s) %in% "4a0b292ba716582f9af46694458c0b9b")] ## Chloroplast
sar16s_tax <- subset(sar16s_tax, row.names(sar16s_tax) %in% colnames(sar16s))

sar16s <- sar16s[,reorder(colnames(sar16s), order(colnames(sar16s)))]
sar16s_tax <- sar16s_tax[reorder(row.names(sar16s_tax), order(row.names(sar16s_tax))),]
sar16s_names <- paste("16S",sprintf('%0.4d', 1:2681), sep = "")
namingconv_16s <- data.frame(colnames(sar16s), sar16s_names)
colnames(namingconv_16s) <- c("Original", "New")
colnames(sar16s) <- sar16s_names
row.names(sar16s_tax) <- sar16s_names

sar18s <- sar18s[,reorder(colnames(sar18s), order(colnames(sar18s)))]
sar18s_tax <- sar18s_tax[reorder(row.names(sar18s_tax), order(row.names(sar18s_tax))),]
sar18s_names <- paste("18S",sprintf('%0.4d', 1:1376), sep = "")
namingconv_18s <- data.frame(colnames(sar18s), sar18s_names)
colnames(namingconv_18s) <- c("Original", "New")
colnames(sar18s) <- sar18s_names
row.names(sar18s_tax) <- sar18s_names

#Make table of ASVs relevant to manuscript (Supplementary Table 2)
relevant_ASVs <- rbind(namingconv_16s[c(740,1049,1385,1555,2175,2662),],
      namingconv_18s[c(46,139,144,157,198,204,251),])
row.names(relevant_ASVs) <- 1:13
write.csv(relevant_ASVs, "relevant_ASVs.csv")


##### Test effects of rarefaction on alpha diversity ####

## iNEXT estimateD to test effects of rarefaction to lowest sample sequence count on alpha diversity
# sar16S.estD <- estimateD(t(sar16s), datatype = "abundance", base="size") ## This takes a very long time to run so csv is provided
# write.csv(sar16S.estD, "Sarracenia_16s_estimateD_data.csv", row.names=F)
sar16S.estD <- read.csv("Sarracenia_16s_estimateD_data.csv")
sar16S.estD1 <- subset(sar16S.estD, sar16S.estD$order == 1)
sar16S.estD1 <- cbind(sample=c(1:160),sar16S.estD1)

## Supplementary Figure 2A
ggplot(data = sar16S.estD1, mapping = aes(x = sample, y = qD)) + 
  geom_point() +
  geom_errorbar(aes(x = sample, ymin = qD.LCL, ymax = qD.UCL)) +
  ylab("Effective Number of Species (16S)") + xlab("Sample") + theme_classic()


# sar18S.estD <- estimateD(t(sar18s), datatype = "abundance", base="size") ## This takes a very long time to run so csv is provided
# write.csv(sar18S.estD, "Sarracenia_18s_estimateD_data.csv", row.names=F)
sar18S.estD <- read.csv("Sarracenia_18s_estimateD_data.csv")
sar18S.estD1 <- subset(sar18S.estD, sar18S.estD$order == 1)
sar18S.estD1 <- cbind(sample=c(1:160),sar18S.estD1)

## Supplementary Figure 2B
ggplot(data = sar18S.estD1, mapping = aes(x = sample, y = qD)) + 
  geom_point() +
  geom_errorbar(aes(x = sample, ymin = qD.LCL, ymax = qD.UCL)) +
  ylab("Effective Number of Species (18S)") + xlab("Sample") + theme_classic()

#Rarefy####
## Rarefaction plots for Supplementary Figure 1
#Rarefy to lowest ASV count for a single sample
set.seed(111)
#rarefaction curve
rarecurve(sar16s, step = 10, label = FALSE, main = "16S rarefaction")
abline(v = 3910, col = "red", lwd = 2)

sar16s.r <- data.frame(rrarefy(sar16s, sample = 3910), check.names = FALSE)
print(min(rowSums(sar16s))) #lowest ASV count in 16s data, 3910

set.seed(111)
#rarefaction curve
rarecurve(sar18s, step = 10, main = "18S rarefaction", label = FALSE)
abline(v = 411, col = "red", lwd = 2)

sar18s.r <- data.frame(rrarefy(sar18s, sample = 411), check.names = FALSE)
print(min(rowSums(sar18s))) #lowest ASV count in 18s data, 411

#Remove any ASVs below desired threshold (default set at 0)
sar16s.r <- sar16s.r[,colSums(sar16s.r) > 0]
sar18s.r <- sar18s.r[,colSums(sar18s.r) > 0]

#Subset tax tables and names objects to match rarefied tables
sar16s.r_tax <- subset(sar16s_tax, row.names(sar16s_tax) %in% colnames(sar16s.r))
namingconv_16s.r <- subset(namingconv_16s, namingconv_16s$New %in% colnames(sar16s.r))
sar18s.r_tax <- subset(sar18s_tax, row.names(sar18s_tax) %in% colnames(sar18s.r))
namingconv_18s.r <- subset(namingconv_18s, namingconv_18s$New %in% colnames(sar18s.r))



#Create family abundance tables####
sar16s.r_family <- c()
for (a in unique(sar16s.r_tax$Family)){
  sar16s.r_family[a] <- data.frame(colSums(subset(as.data.frame(t(sar16s.r)), sar16s.r_tax$Family %in% a)))
}
sar16s.r_family <- data.frame(sar16s.r_family, check.names = FALSE)
row.names(sar16s.r_family) <- row.names(sar16s.r)


##Alpha Diversity ####
#Calculate alpha diversity, add to metadata
mdat$shannon.16s <- diversity(sar16s.r) 
mdat$shannon.18s <- diversity(sar18s.r)

mdat$effective.16s <- round(exp(mdat$shannon.16s))
mdat$effective.18s <- round(exp(mdat$shannon.18s))

#Generalized Linear Models####
#16s
#Remove S. purpurea (not in Gulf Coast)
sar_sbst <- subset(sar16s.r, mdat$Host_Spp != "purpurea")
mdat_sbst <- subset(mdat, mdat$Host_Spp != "purpurea")

#16s - using all factors, Supplementary Table 3
glm_16sall <- glm(effective.16s ~ Host_Spp + Site + pH + Volume + Latitude + Longitude, family = 'poisson', data = mdat_sbst)
#Supplementary table
write.csv(summary(glm_16sall)$coefficients, "supplemtal_glm_16s.csv")

## Significant = rosea, leuco-rosea, all sites except weeks bay, all other factors
plot(allEffects(glm_16sall))

## 16s - omnibus test
glm_16sall_reduced <- glm(effective.16s ~ Site + pH + Volume + Latitude + Longitude, family = 'poisson', data = mdat_sbst)
glm_16sall_reduced2 <- glm(effective.16s ~ Host_Spp + pH + Volume + Latitude + Longitude, family = 'poisson', data = mdat_sbst)
anova(glm_16sall_reduced, glm_16sall, test = 'Chisq')
anova(glm_16sall_reduced2, glm_16sall, test = 'Chisq')


#18s
sar_sbst <- subset(sar18s.r, mdat$Host_Spp != "purpurea")

#18s - using all factors, Supplementary Table 4
glm_18sall <- glm(effective.18s ~ Host_Spp + Site + pH + Volume + Latitude + Longitude, family = 'poisson', data = mdat_sbst)
summary(glm_18sall)


## No significant results
plot(allEffects(glm_18sall))

## 18s - omnibus test
glm_18sall_reduced <- glm(effective.18s ~ Site + pH + Volume + Latitude + Longitude, family = 'poisson', data = mdat_sbst)
glm_18sall_reduced2 <- glm(effective.18s ~ Host_Spp + pH + Volume + Latitude + Longitude, family = 'poisson', data = mdat_sbst)
anova(glm_18sall_reduced, glm_18sall, test = 'Chisq')
anova(glm_18sall_reduced2, glm_18sall, test = 'Chisq')


##Alpha Diversity Plots
#Figure 1B - 16s Alpha Diversity by Host Species#### 
#Subset metadata to remove hybrid host species before making figure
mdat.s <- subset(mdat, mdat$Host_Spp %in% c("alata","flava","leucophylla","purpurea","rosea","rubra"))
#specify color palette by host species
colors1 <- c("alata" = "blue", "flava" = "forestgreen", "leucophylla" = "cyan3", "purpurea" = "purple", "rosea" = "red", "rubra" = "gold1")
#plot  
ggplot(data = mdat.s, mapping = aes(x = Host_Spp, y = effective.16s, color = Host_Spp)) + 
  geom_violin(trim = FALSE, draw_quantiles = .5) + geom_jitter(width = 0.1, alpha = .5) +
  scale_color_manual(values = colors1) + theme_classic() +
  ylab("Effective Number of Species") + xlab("Host Species") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")

#Figure 1C - 18s Alpha Diversity by Host Species#### 
#plot
ggplot(data = mdat.s, mapping = aes(x = Host_Spp, y = effective.18s, color = Host_Spp)) + 
  geom_violin(trim = FALSE, draw_quantiles = .5) + geom_jitter(width = 0.1, alpha = .5) +
  scale_color_manual(values = colors1) + theme_classic() +
  ylab("Effective Number of Species") + xlab("Host Species") + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")

#Figure Splinter Hill 2A - 16s Splinter Hill Alpha Diversity by Host Species####
mdat.s.s <- subset(mdat, mdat$Site %in% "Splinter_Hill")
#order levels
mdat.s.s$Host_Spp <- factor(mdat.s.s$Host_Spp, levels = c("leucophylla", "leuco-rosea", "rosea", "rubra"))
#plot
ggplot(data = mdat.s.s, mapping = aes(x = Host_Spp, y = effective.16s, color = Host_Spp)) + 
  geom_violin(trim = FALSE, draw_quantiles = .5) + geom_jitter(width = 0.1, alpha = .5) +
  scale_color_manual(values = c("gold","darkorange1","red","cyan")) + theme_classic() +
  ylab("Effective Number of Species") + xlab("Host Species") + ylim(0, 80) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")

#Figure Splinter Hill 2B - 18s Splinter Hill Alpha Diversity by Host Species####
ggplot(data = mdat.s.s, mapping = aes(x = Host_Spp, y = effective.18s, color = Host_Spp)) + 
  geom_violin(trim = FALSE, draw_quantiles = .5) + geom_jitter(width = 0.1, alpha = .5) +
  scale_color_manual(values = c("red","darkorange1","gold","cyan")) + theme_classic() +
  ylab("Effective Number of Species") + xlab("Host Species") + ylim(0, 35) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")

#Figure 3A - 16s Leucophylla Alpha Diversity by Site####
#Subset data to only samples that have leucophylla 
mdat.l.s <- subset(mdat, mdat$Host_Spp %in% "leucophylla")
#make object of leucophylla sites in geographic order
site_geog <- c("Weeks_Bay", "Splinter_Hill", "Blackwater", "Eglin_Base_YR", "Eglin_Base", "Nokuse_Bog")
#plot
ggplot(data = mdat.l.s, mapping = aes(x = factor(Site, level = site_geog), y = effective.16s)) + 
  geom_violin(trim = FALSE, draw_quantiles = .5) + geom_jitter(width = 0.1) +
  ylab("Effective Number of Species") + xlab("Site") + theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#Figure 3B - 18s Leucophylla Alpha Diversity by Site####
ggplot(data = mdat.l.s, mapping = aes(x = factor(Site, level = site_geog), y = effective.18s)) + 
  geom_violin(trim = FALSE, draw_quantiles = .5) + geom_jitter(width = 0.1) +
  ylab("Effective Number of Species") + xlab("Site") + theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


  
##### Beta Diversity Analyses ####
#Load phylogenetic trees and generate Unifrac distance objects ####
#16s
#Load phylogenetic tree
phylo.16s <- read.tree("16S_Tree_GC_Sarracenia.tre")

#Merge ASV table and pruned phylo tree, create Phyloseq object
#use original ASV names for this step
colnames(sar16s.r) <- namingconv_16s.r$Original
physeq.16s <- merge_phyloseq(otu_table(t(sar16s.r), taxa_are_rows = T), 
                             prune.sample(sar16s.r, phylo.16s))
colnames(sar16s.r) <- namingconv_16s.r$New

#Calculate weighted Unifrac distance 
dist.16s <- phyloseq::distance(physeq.16s, method = "wunifrac")

#18s
#Load phlogenetic tree
phylo.18s <- read.tree("18S_Tree_GC_Sarracenia.tre")
phylo.18s <- ape::multi2di(phylo.18s) ### Resolve multichotomies

#Merge ASV table and pruned phylo tree, create Phyloseq object
colnames(sar18s.r) <- namingconv_18s.r$Original
physeq.18s <- merge_phyloseq(otu_table(t(sar18s.r), taxa_are_rows = TRUE), prune.sample(sar18s.r, phylo.18s))
colnames(sar18s.r) <- namingconv_18s.r$New

#Calculate weighted Unifrac distance 
dist.18s <- phyloseq::distance(physeq.18s, method = "wunifrac")

#PERMANOVA and Mantel for both 16s and 18s data-sets####
#16s
#PERMANOVA - without hybrids
sar16s.s.r <- subset(sar16s.r, row.names(sar16s.r) %in% row.names(mdat.s))
names16s.s <- subset(namingconv_16s.r, namingconv_16s.r$New %in% colnames(sar16s.s.r))
colnames(sar16s.s.r) <- names16s.s$Original
physeq.s.16s <- merge_phyloseq(otu_table(t(sar16s.s.r), taxa_are_rows = T),prune.sample(sar16s.s.r, phylo.16s))
dist.s.16s <- phyloseq::distance(physeq.s.16s, method = "wunifrac")

perm.s.16s.SH <- adonis2(dist.s.16s ~ Site * Host_Spp, data = mdat.s, by = "margin") 
perm.s.16s.H <- adonis(dist.s.16s ~ Host_Spp, data = mdat.s) 
perm.s.16s.S <- adonis(dist.s.16s ~ Site, data = mdat.s) 

#Mantel test
factorlist <- colnames(mdat.s)[7:10]
mantdf.16s <- c()
for (i in factorlist){
  #Mantel
  mant <- mantel(dist.s.16s, vegdist(mdat.s[[i]], method="euclidean"), permutations=999) 
  #Generate mantel stat tables for all species entered
  mantdf.16s[[i]] <- c("Statistic"=mant[["statistic"]],"Significance"=mant[["signif"]])
}
mantdf.16s <- data.frame(mantdf.16s)
mantdf.16s
#pairwise adonis
pairperm.s.16s.H <- pairwise.adonis(dist.s.16s, mdat.s$Host_Spp, p.adjust.m='holm')
pairperm.s.16s.H
write.csv(pairperm.s.16s.H, "pairperm_s_16s_H.csv")
pairperm.s.16s.S <- pairwise.adonis(dist.s.16s, mdat.s$Host_Spp, p.adjust.m='holm')
pairperm.s.16s.S
write.csv(pairperm.s.16s.S, "pairperm_s_16s_S.csv")

#18s
#PERMANOVA - without hybrids
sar18s.s.r <- subset(sar18s.r, row.names(sar18s.r) %in% row.names(mdat.s))
names18s.s <- subset(namingconv_18s.r, namingconv_18s.r$New %in% colnames(sar18s.s.r))
colnames(sar18s.s.r) <- names18s.s$Original
physeq.s.18s <- merge_phyloseq(otu_table(t(sar18s.s.r), taxa_are_rows = T), 
                               prune.sample(sar18s.s.r, phylo.18s))
dist.s.18s <- phyloseq::distance(physeq.s.18s, method = "wunifrac")

perm.s.18s.SH <- adonis2(dist.s.18s ~ Site * Host_Spp, data = mdat.s, by = "margin") 
perm.s.18s.H <- adonis(dist.s.18s ~ Host_Spp, data = mdat.s)
perm.s.18s.S <- adonis(dist.s.18s ~ Site, data = mdat.s)
#pairwise adonis
pairperm.s.18s.H <- pairwise.adonis(dist.s.18s, mdat.s$Host_Spp, p.adjust.m='holm')
pairperm.s.18s.H
write.csv(pairperm.s.18s.H, "pairperm_s_18s_H.csv")

pairperm.s.18s.S<- pairwise.adonis(dist.s.18s, mdat.s$Host_Spp, p.adjust.m='holm')
pairperm.s.18s.S
write.csv(pairperm.s.18s.S, "pairperm_s_18s_S.csv")

#Mantel test
mantdf.18s <- c()
for (i in factorlist){
  #Mantel
  mant <- mantel(dist.s.18s, vegdist(mdat.s[[i]], method="euclidean"), permutations=999) 
  #Generate mantel stat tables for all species entered
  mantdf.18s[[i]] <- c("Statistic"=mant[["statistic"]],"Significance"=mant[["signif"]])
}
mantdf.18s <- data.frame(mantdf.18s)
mantdf.18s

#Create p-adjusted beta diversity table for both 16s and 18s data-set (all species except hybrids) ####
betadiv.t1 <- t(data.frame("16s.Stat/R2" = c(perm.s.16s.SH$R2[1],perm.s.16s.H$aov.tab$R2[1],perm.s.16s.S$aov.tab$R2[1], as.numeric(mantdf.16s[1,])),
                           "16s.Significance" = p.adjust(c(perm.s.16s.SH$`Pr(>F)`[1],perm.s.16s.H$aov.tab$`Pr(>F)`[1],perm.s.16s.S$aov.tab$`Pr(>F)`[1], as.numeric(mantdf.16s[2,])), "holm"),
                           "18s.Stat/R2" = c(perm.s.18s.SH$R2[1],perm.s.18s.H$aov.tab$R2[1],perm.s.18s.S$aov.tab$R2[1], as.numeric(mantdf.18s[1,])),
                           "18s.Significance" = p.adjust(c(perm.s.18s.SH$`Pr(>F)`[1],perm.s.18s.S$aov.tab$`Pr(>F)`[1],perm.s.18s.H$aov.tab$`Pr(>F)`[1], as.numeric(mantdf.18s[2,])), "holm"),
                           check.names = F, row.names = c("Host Species + Site","Host Species", "Site", "pH", "Volume", "Latitude", "Longitude")))
#Round to sig figs
betadiv.t1 <- round(betadiv.t1, digits = 3)

#export for Table 1
write.csv(betadiv.t1, file = "all_permmantel_holm_unifrac.csv")

#Beta diversity table subset by Host Species (16s or 18s must be specified)####
#This function will subset data by all species specified 
#and run mantel tests for all specified factors
#PERMANOVA run for Site automatically if # Sites > 1
host_betadiv <- function(species, factor, data, tree){
  
  #create output object 
  df.a <- c()

  #for loop will: 
  #repeat for every species entered
  #calculate PERMANOVA for Site if #Sites > 1
  #calculate Mantel for every continuous variable entered
  #Output results w/ p values adjusted by holm method
  for (i in species){
    #Subset data-sets
    sar_sbst <- subset(data, mdat$Host_Spp %in% i)
    mdat_sbst <- subset(mdat, row.names(mdat) %in% row.names(sar_sbst))
    #Drop old factors from metadata
    mdat_sbst$Host_Spp <- mdat_sbst$Host_Spp[drop=T]
    
    #Unifrac
    #Load ASV table and pruned phylo tree, create phyloseq object
    physeq <- merge_phyloseq(otu_table(t(sar_sbst), taxa_are_rows = T), 
                             prune.sample(sar_sbst, tree))
    #Calculate weighted Unifrac
    uni <- phyloseq::distance(physeq, method = "wunifrac")
    
    #PERMANOVA with weighted UniFrac
    if(nlevels(factor(mdat_sbst$Site)) > 1){
      p <- adonis(uni ~ Site, data=mdat_sbst)
      #discard all PERMANOVA info except 'R2' and 'Pr(>F)'
      p <- data.frame(p$aov.tab$R2[[1]],p$aov.tab$'Pr(>F)'[[1]])
    }
    else{
      #if #Sites == 1, PERMANOVA is not run, data represented as 'NA' in output
      p <- data.frame("NA","NA")
    }
    
    #Mantel w/ Unifrac
    m <- c() #mantel tests storage object, resets w/ every new host species
    for (j in factor[1:length(factor)]){
      df.b <- c() #temporary storage object, resets for every variable
      
      #Mantel
      df.b <- mantel(uni, vegdist(mdat_sbst[[j]], method="euclidean"),
                     permutations=999)
      #retain Mantel statistic and significance, discard rest
      #m object will accumulate mantel data for every variable for current host species
      m[[j]] <- c("Statistic" = df.b[["statistic"]], "Significance" = df.b[["signif"]])
    }
    #convert mantel info to data.frame format
    m <- as.data.frame(m)
    #load all PERMANOVA/Mantel results into output object
    #Statistic/R2
    df.a[[paste(i, "_stat/R2", sep = "")]] <- as.numeric(c(m[1,], p[1]))
    #p values
    #adjusts all p values using the "holm" method
    df.a[[paste(i, "_pvalue", sep = "")]] <- p.adjust(as.numeric(c(m[2,], p[2])), method = "holm")
  }
  #after all host species have run, convert to output to data.frame format
  df.a <- as.data.frame(df.a)
  #assign row names for all variables tested
  row.names(df.a) <- c(factor, "Site")
  #output results
  return(t(df.a))
}

#Generate 16s Table 3
colnames(sar16s.r) <- namingconv_16s.r$Original
hostbeta.16s.df <- host_betadiv(species = c("alata", "flava", "leucophylla", "purpurea","rosea", "rubra"),
                       factor = c("Volume", "pH", "Latitude", "Longitude"),
                       data = sar16s.r, tree = phylo.16s)
colnames(sar16s.r) <- namingconv_16s.r$New

#Save for Table 3
write.csv(hostbeta.16s.df, file = "16s_perm-mantel_holmadj_host_unifrac.csv")

#Generate 18s table 4
colnames(sar18s.r) <- namingconv_18s.r$Original
hostbeta.18s.df <- host_betadiv(species = c("alata", "flava", "leucophylla", "purpurea","rosea","rubra"),
                            factor = c("Volume", "pH", "Latitude", "Longitude"),
                            data = sar18s.r, tree = phylo.18s)
colnames(sar18s.r) <- namingconv_18s.r$New

#Save for Table 4
write.csv(hostbeta.18s.df, file = "18s_perm-mantel_holmadj_host_unifrac.csv")

#Beta diversity table subset by Site (16s or 18s must be specified)####
#This function will subset data by all sites specified 
#and run mantel tests for all specified factors
#PERMANOVA run for Host Species automatically if # Host Species > 1
site_betadiv <- function(sites, factor, data, tree){
  
  #create output object 
  df.a <- c()
  
  #for loop will: 
  #repeat for every site entered
  #calculate PERMANOVA for Site if #Host Species > 1
  #calculate Mantel for every continuous variable entered
  #Output results w/ p values adjusted by holm method
  for (i in sites){
    #Subset data-sets
    sar_sbst <- subset(data, mdat$Site %in% i)
    mdat_sbst <- subset(mdat, row.names(mdat) %in% row.names(sar_sbst))
    #Drop old factors from metadata
    mdat_sbst$Site <- mdat_sbst$Site[drop=T]
    
    #Unifrac
    #Load ASV table and pruned phylo tree, create phyloseq object
    physeq <- merge_phyloseq(otu_table(t(sar_sbst), taxa_are_rows = T), 
                             prune.sample(sar_sbst, tree))
    #Calculate weighted Unifrac
    uni <- phyloseq::distance(physeq, method = "wunifrac")
    
    #PERMANOVA with weighted UniFrac
    if(nlevels(factor(mdat_sbst$Host_Spp)) > 1){
      p <- adonis(uni ~ Host_Spp, data=mdat_sbst, method = "")
      #discard all PERMANOVA info except 'R2' and 'Pr(>F)'
      p <- data.frame(p$aov.tab$R2[[1]],p$aov.tab$'Pr(>F)'[[1]])
    }
    else{
      #if #Host Species == 1, PERMANOVA is not run, data represented as 'NA' in output
      p <- data.frame("NA","NA")
    }
    
    #Mantel w/ Unifrac
    m <- c() #mantel tests storage object, resets w/ every new site
    for (j in factor[1:length(factor)]){
      df.b <- c() #temporary storage object, resets for every variable
      
      #Mantel
      df.b <- mantel(uni, vegdist(mdat_sbst[[j]], method="euclidean"),
                     permutations=999)
      #retain Mantel statistic and significance, discard rest
      #m object will accumulate mantel data for every variable for current site
      m[[j]] <- c("Statistic" = df.b[["statistic"]], "Significance" = df.b[["signif"]])
    }
    #convert mantel info to data.frame format
    m <- as.data.frame(m)
    #load all PERMANOVA/Mantel results into output object
    #Statistic/R2
    df.a[[paste(i, "_stat/R2", sep = "")]] <- as.numeric(c(m[1,], p[1]))
    #p values
    #adjusts all p values using the "holm" method
    df.a[[paste(i, "_pvalue", sep = "")]] <- p.adjust(as.numeric(c(m[2,], p[2])), method = "holm")
  }
  #after all host species have run, convert to output to data.frame format
  df.a <- as.data.frame(df.a)
  #assign row names for all variables tested
  row.names(df.a) <- c(factor, "Host_Species")
  #output results
  return(t(df.a))
}

#Generate table
colnames(sar16s.r) <- namingconv_16s.r$Original
sitebeta.16s.df <- site_betadiv(sites = c("Blackwater","Splinter_Hill", "Crystal_Bog", "Plea_Phase_Savanna", "Eglin_Base"),
                            factor = c("Volume", "pH"),
                            data = sar16s.r, tree = phylo.16s)
colnames(sar16s.r) <- namingconv_16s.r$New

#Save for Table 2
write.csv(sitebeta.16s.df, file = "16s_perm-mantel_holmadj_site_unifrac.csv")

#Generate 18s table
colnames(sar18s.r) <- namingconv_18s.r$Original
sitebeta.18s.df <- site_betadiv(sites = c("Blackwater","Splinter_Hill", "Crystal_Bog", "Plea_Phase_Savanna", "Eglin_Base"),
                            factor = c("Volume", "pH"),
                            data = sar18s.r, tree = phylo.18s)
colnames(sar18s.r) <- namingconv_18s.r$New

#Save for Table 2
write.csv(sitebeta.18s.df, file = "18s_perm-mantel_holmadj_site_unifrac.csv")



#NMDS - 16s & 18s, no hybrids####

#run NMDS for 16s
set.seed(111)
nmds.16s <- vegan::metaMDS(dist.s.16s, k = 2, trymax=500)

set.seed(257)
nmds.18s <- vegan::metaMDS(dist.s.18s, k=3, trymax=1000)

#Figure 1D - 16s species highlighted####
plot(nmds.16s$points, xlab="NMDS Axis 1", ylab="NMDS Axis 2", 
     col = c("blue", "forestgreen","turquoise", "purple", "red", "gold")[as.factor(mdat.s$Host_Spp)],
     pch=19)
ordispider(nmds.16s$points, groups=mdat.s$Host_Spp, show.groups="alata", col=adjustcolor("blue",0.2), spiders = "median", lwd=3)
ordispider(nmds.16s$points, groups=mdat.s$Host_Spp, show.groups="flava", col=adjustcolor("forestgreen",0.2), spiders = "median", lwd=3)
ordispider(nmds.16s$points, groups=mdat.s$Host_Spp, show.groups="leucophylla", col=adjustcolor("turquoise",0.2), spiders = "median", lwd=3)
ordispider(nmds.16s$points, groups=mdat.s$Host_Spp, show.groups="purpurea", col=adjustcolor("purple",0.4), spiders = "median", lwd=3)
ordispider(nmds.16s$points, groups=mdat.s$Host_Spp, show.groups="rosea", col=adjustcolor("red",0.2), spiders = "median", lwd=3)
ordispider(nmds.16s$points, groups=mdat.s$Host_Spp, show.groups="rubra", col=adjustcolor("gold",0.4), spiders = "median", lwd=3)
legend("topleft", unique(mdat.s$Host_Spp), fill= c("blue", "forestgreen","turquoise", "purple", "red", "gold"),
       bty="n", cex = 1)

palette <- c("#88CCEE", "#CC6677", "#DDCC77", "#117733", "#332288", "#AA4499", "#44AA99", "#999933", "#882255", "#661100", "#6699CC", "#888888", "#FFA500", "#FF0000")
string <- sort(as.character(unique(as.factor(mdat.s$Site))))

# Site version of Figure 1D - Supplementary Figure 3
plot(nmds.16s$points, xlab="NMDS Axis 1", ylab="NMDS Axis 2", 
     col = palette[as.factor(mdat.s$Site)], pch=19, main = "16S NMDS")
legend("bottomleft", string, fill= palette,
       bty="n", cex = .75)
plot(envfit(ord = nmds.16s$points, env = mdat.s$Volume),
     labels = "Volume", col = "red", cex = .75)
plot(envfit(ord = nmds.16s$points, env = mdat.s$pH),
     labels = "pH", col = "red", cex = .75)
plot(envfit(ord = nmds.16s$points, env = mdat.s$Latitude),
     labels = "Latitude", col = "red", cex = .75)
plot(envfit(ord = nmds.16s$points, env = mdat.s$Longitude),
     labels = "Longitude", col = "red", cex = .75)

# Site version of Figure 1E - Supplementary Figure 4
plot(nmds.18s$points, xlab="NMDS Axis 1", ylab="NMDS Axis 2", 
     col = palette[as.factor(mdat.s$Site)], pch=19, main = "18S NMDS")
legend("bottomleft", string, fill= palette,
       bty="n", cex = .75)


#Figure 1E - 18s species highlighted####
plot(nmds.18s$points, xlab="NMDS Axis 1", ylab="NMDS Axis 2", 
     col= c("blue", "forestgreen","turquoise", "purple", "red", "gold")[as.factor(mdat.s$Host_Spp)],
     pch=19)
ordispider(nmds.18s$points, groups=mdat.s$Host_Spp, show.groups="alata", col=adjustcolor("blue",0.2), spiders = "median", lwd=3)
ordispider(nmds.18s$points, groups=mdat.s$Host_Spp, show.groups="flava", col=adjustcolor("forestgreen",0.2), spiders = "median", lwd=3)
ordispider(nmds.18s$points, groups=mdat.s$Host_Spp, show.groups="leucophylla", col=adjustcolor("turquoise",0.2), spiders = "median", lwd=3)
ordispider(nmds.18s$points, groups=mdat.s$Host_Spp, show.groups="purpurea", col=adjustcolor("purple",0.4), spiders = "median", lwd=3)
ordispider(nmds.18s$points, groups=mdat.s$Host_Spp, show.groups="rosea", col=adjustcolor("red",0.2), spiders = "median", lwd=3)
ordispider(nmds.18s$points, groups=mdat.s$Host_Spp, show.groups="rubra", col=adjustcolor("gold",0.4), spiders = "median", lwd=3)
legend("bottomleft", unique(mdat.s$Host_Spp), fill= c("blue", "forestgreen","turquoise", "purple", "red", "gold"),
       bty="n", cex = 1)

#NMDS - 16s & 18s; Splinter Hill only####
#Subset data
sar16s.s.s <- subset(sar16s.r, rownames(sar16s.r) %in% rownames(mdat.s.s))
sar18s.s.s <- subset(sar18s.r, rownames(sar18s.r) %in% rownames(mdat.s.s))
mdat.s.s <- droplevels(mdat.s.s)

#create physeq object
physeq.s.16s <- merge_phyloseq(otu_table(t(sar16s.s.s), taxa_are_rows = T), 
                               prune.sample(sar16s.s.s, phylo.16s))
#Calculate weighted Unifrac distance
dist.s.16s <- phyloseq::distance(physeq.s.16s, method = "wunifrac")
#run NMDS
set.seed(111)
nmds.s.16s <- vegan::metaMDS(dist.s.16s, k = 2, trymax=500)
## Stress is 0.11

#18s
#Load ASV table and pruned phylo tree, create Unifrac object
physeq.s.18s <- merge_phyloseq(otu_table(t(sar18s.s.s), taxa_are_rows = T), 
                               prune.sample(sar18s.s.s, phylo.18s))
#Calculate weighted Unifrac distance
dist.s.18s <- phyloseq::distance(physeq.s.18s, method = "wunifrac")
#run NMDS
set.seed(111)
nmds.s.18s <- vegan::metaMDS(dist.s.18s, k=2, trymax=500)
## Stress is 0.13


#Figure Spinter Hill 2C - 16s NMDS; Host Species highlighted####
plot(nmds.s.16s$points, xlab="nmds.s Axis 1", ylab="nmds.s Axis 2", 
     col = c("darkorange1","gold","red","cyan")[as.factor(mdat.s.s$Host_Spp)],
     pch=19)
ordispider(nmds.s.16s$points, groups=mdat.s.s$Host_Spp, show.groups="rosea", col=adjustcolor("red",0.2), spiders = "median", lwd=3)
ordispider(nmds.s.16s$points, groups=mdat.s.s$Host_Spp, show.groups="leucophylla", col=adjustcolor("gold",0.2), spiders = "median", lwd=3)
ordispider(nmds.s.16s$points, groups=mdat.s.s$Host_Spp, show.groups="leuco-rosea", col=adjustcolor("darkorange1",0.4), spiders = "median", lwd=3)
ordispider(nmds.s.16s$points, groups=mdat.s.s$Host_Spp, show.groups="rubra", col=adjustcolor("cyan",0.4), spiders = "median", lwd=3)
legend("topleft", as.character(unique(mdat.s.s$Host_Spp)), fill= c("gold", "darkorange1","red", "cyan"),
       bty="n", cex = 1)


#Figure Spinter Hill 2D - 18s NMDS; Host Species highlighted####
plot(nmds.s.18s$points, xlab="nmds.s Axis 1", ylab="nmds.s Axis 2", 
     col = c("darkorange1","gold","red","cyan")[as.factor(mdat.s.s$Host_Spp)],
     pch=19)
ordispider(nmds.s.18s$points, groups=mdat.s.s$Host_Spp, show.groups="rosea", col=adjustcolor("red",0.2), spiders = "median", lwd=3)
ordispider(nmds.s.18s$points, groups=mdat.s.s$Host_Spp, show.groups="leucophylla", col=adjustcolor("gold",0.2), spiders = "median", lwd=3)
ordispider(nmds.s.18s$points, groups=mdat.s.s$Host_Spp, show.groups="leuco-rosea", col=adjustcolor("darkorange1",0.4), spiders = "median", lwd=3)
ordispider(nmds.s.18s$points, groups=mdat.s.s$Host_Spp, show.groups="rubra", col=adjustcolor("cyan",0.4), spiders = "median", lwd=3)
legend("topleft", as.character(unique(mdat.s.s$Host_Spp)), fill= c("gold","darkorange1","red","cyan"),
       bty="n", cex = 1)

#NMDS - 16s & 18s, leucophylla only####
#Subset data
sar16s.l.s <- subset(sar16s.r, rownames(sar16s.r) %in% rownames(mdat.l.s))
sar18s.l.s <- subset(sar18s.r, rownames(sar18s.r) %in% rownames(mdat.l.s))
mdat.l.s$Host_Spp <- mdat.l.s$Host_Spp[drop=T]

#create physeq object
physeq.l.16s <- merge_phyloseq(otu_table(t(sar16s.l.s), taxa_are_rows = T), 
                             prune.sample(sar16s.l.s, phylo.16s))
#Calculate weighted Unifrac distance
dist.l.16s <- phyloseq::distance(physeq.l.16s, method = "wunifrac")
#run NMDS
set.seed(111)
nmds.l.16s <- vegan::metaMDS(dist.l.16s, k = 2, trymax=1000)
## Stress is 0.12

#18s
#Load ASV table and pruned phylo tree, create Unifrac object
physeq.l.18s <- merge_phyloseq(otu_table(t(sar18s.l.s), taxa_are_rows = T), 
                             prune.sample(sar18s.l.s, phylo.18s))
#Calculate weighted Unifrac distance
dist.l.18s <- phyloseq::distance(physeq.l.18s, method = "wunifrac")
#run NMDS
set.seed(111)
nmds.l.18s <- vegan::metaMDS(dist.l.18s, k=2, trymax=1000)
## Stress is 0.085


#Figure 3C - Leucophylla 16s, Longitude overlaid####
ordiplot(nmds.l.16s, type = "point", display="sites",
         xlim=c(min(nmds.l.16s$points[,1]),max(nmds.l.16s$points[,1])),
         ylim=c(min(nmds.l.16s$points[,2]),max(nmds.l.16s$points[,2])))
with(mdat.l.s, ordisurf(nmds.l.16s,Longitude,bubble=T,cex=1.5,col="grey", main = NULL))

#Figure 3D - Leucophylla 18s, Longitude overlaid####
ordiplot(nmds.l.18s, type = "point", display="sites",
         xlim=c(min(nmds.l.16s$points[,1]),max(nmds.l.16s$points[,1])),
         ylim=c(min(nmds.l.16s$points[,2]),max(nmds.l.16s$points[,2])))
with(mdat.l.s, ordisurf(nmds.l.18s,Longitude,bubble=T,cex=1.5,col="grey", main = NULL))


##### NMDS and PERMANOVAs for hybrids and their parent species #####
#### flava-leucophylla ####
#NMDS - 16s & 18s; flava, leucophylla, flava-leucophylla only 
#Subset data 
sar16s.h1.s <- subset(sar16s.r, mdat$Host_Spp %in% c("flava","flava-leuco","leucophylla"))
sar18s.h1.s <- subset(sar18s.r, mdat$Host_Spp %in% c("flava","flava-leuco","leucophylla"))
mdat.h1.s <- subset(mdat, mdat$Host_Spp %in% c("flava","flava-leuco","leucophylla"))
mdat.h1.s$Host_Spp <- mdat.h1.s$Host_Spp[drop=T]

#create physeq object
physeq.h1.16s <- merge_phyloseq(otu_table(t(sar16s.h1.s), taxa_are_rows = T), 
                             prune.sample(sar16s.h1.s, phylo.16s))
#Calculate weighted Unifrac distance
dist.h1.16s <- phyloseq::distance(physeq.h1.16s, method = "wunifrac")

## PERMANOVA 
hybrid1.perm.16s <- adonis(formula = dist.h1.16s ~ Host_Spp, data = mdat.h1.s)
hybrid1.perm.16s

hybrid1.perm.16s.pw <- pairwise.adonis(dist.h1.16s, mdat.h1.s$Host_Spp, p.adjust.m='holm')
hybrid1.perm.16s.pw

#run NMDS
set.seed(111)
nmds.h1.16s <- vegan::metaMDS(dist.h1.16s, k = 2, trymax=1000)

#18s
#Load ASV table and pruned phylo tree, create Unifrac object
physeq.h1.18s <- merge_phyloseq(otu_table(t(sar18s.h1.s), taxa_are_rows = T), 
                             prune.sample(sar18s.h1.s, phylo.18s))
#Calculate weighted Unifrac distance
dist.h1.18s <- phyloseq::distance(physeq.h1.18s, method = "wunifrac")

#PERMANOVAs
hybrid1.perm.18s <- adonis(formula = dist.h1.18s ~ Host_Spp, data = mdat.h1.s)
hybrid1.perm.18s

hybrid1.perm.18s.pw <- pairwise.adonis(dist.h1.18s, mdat.h1.s$Host_Spp, p.adjust.m='holm')
hybrid1.perm.18s.pw

#run NMDS
set.seed(111)
nmds.h1.18s <- vegan::metaMDS(dist.h1.18s, k=2, trymax=500)

#Hybrids figure 4C - 16s NMDS; flava, leucophylla, flava-leucophylla
plot(nmds.h1.16s$points, xlab="nmds.h1 Axis 1", ylab="nmds.h1 Axis 2", 
     col = c("blue","forestgreen","gold")[as.factor(mdat.h1.s$Host_Spp)],
     pch=19)
ordispider(nmds.h1.16s$points, groups=mdat.h1.s$Host_Spp, show.groups="flava", col=adjustcolor("blue",0.2), spiders = "median", lwd=3)
ordispider(nmds.h1.16s$points, groups=mdat.h1.s$Host_Spp, show.groups="leucophylla", col=adjustcolor("gold",0.2), spiders = "median", lwd=3)
ordispider(nmds.h1.16s$points, groups=mdat.h1.s$Host_Spp, show.groups="flava-leuco", col=adjustcolor("forestgreen",0.4), spiders = "median", lwd=3)
legend("topleft", unique(mdat.h1.s$Host_Spp), fill= c("blue", "forestgreen","gold"),
       bty="n", cex = 1)

#Hybrids figure 4I - 18s NMDS; flava, leucophylla, flava-leucophylla####
plot(nmds.h1.18s$points, xlab="nmds.h1 Axis 1", ylab="nmds.h1 Axis 2", 
     col = c("blue","forestgreen","gold")[as.factor(mdat.h1.s$Host_Spp)],
     pch=19)
ordispider(nmds.h1.18s$points, groups=mdat.h1.s$Host_Spp, show.groups="flava", col=adjustcolor("blue",0.2), spiders = "median", lwd=3)
ordispider(nmds.h1.18s$points, groups=mdat.h1.s$Host_Spp, show.groups="leucophylla", col=adjustcolor("gold",0.2), spiders = "median", lwd=3)
ordispider(nmds.h1.18s$points, groups=mdat.h1.s$Host_Spp, show.groups="flava-leuco", col=adjustcolor("forestgreen",0.4), spiders = "median", lwd=3)
legend("topright", unique(mdat.h1.s$Host_Spp), fill= c("blue", "forestgreen","gold"),
       bty="n", cex = 1)

#NMDS and PERMANOVA 16s & 18s; flava, rosea, flava-rosea only####
#Subset data 
sar16s.h2.s <- subset(sar16s.r, mdat$Host_Spp %in% c("flava","flava-rosea","rosea"))
sar18s.h2.s <- subset(sar18s.r, mdat$Host_Spp %in% c("flava","flava-rosea","rosea"))
mdat.h2.s <- subset(mdat, mdat$Host_Spp %in% c("flava","flava-rosea","rosea"))
mdat.h2.s$Host_Spp <- mdat.h2.s$Host_Spp[drop=T]

#create physeq object
physeq.h2.16s <- merge_phyloseq(otu_table(t(sar16s.h2.s), taxa_are_rows = T), 
                             prune.sample(sar16s.h2.s, phylo.16s))
#Calculate weighted Unifrac distance
dist.h2.16s <- phyloseq::distance(physeq.h2.16s, method = "wunifrac")

### PERMANOVAs
hybrid.perm.h2.16s <- adonis(formula = dist.h2.16s ~ Host_Spp, data = mdat.h2.s)
hybrid.perm.h2.16s

hybrid.perm.h2.16s.pw <- pairwise.adonis(dist.h2.16s, mdat.h2.s$Host_Spp, p.adjust.m='holm')
hybrid.perm.h2.16s.pw

#run NMDS
set.seed(111)
nmds.h2.16s <- vegan::metaMDS(dist.h2.16s, k = 2, trymax=1000)

#18s
#Load ASV table and pruned phylo tree, create Unifrac object
physeq.h2.18s <- merge_phyloseq(otu_table(t(sar18s.h2.s), taxa_are_rows = T), 
                             prune.sample(sar18s.h2.s, phylo.18s))
#Calculate weighted Unifrac distance
dist.h2.18s <- phyloseq::distance(physeq.h2.18s, method = "wunifrac")

#PERMANOVAs
hybrid.perm.h2.18s <- adonis(formula = dist.h2.18s ~ Host_Spp, data = mdat.h2.s)
hybrid.perm.h2.18s

hybrid.perm.h2.18s.pw <- pairwise.adonis(dist.h2.18s, mdat.h2.s$Host_Spp, p.adjust.m='holm')
hybrid.perm.h2.18s.pw

#run NMDS
set.seed(111)
nmds.h2.18s <- vegan::metaMDS(dist.h2.18s, k=2, trymax=1000)

#Hybrids figure 4A - 16s NMDS; flava, rosea, flava-rosea####
plot(nmds.h2.16s$points, xlab="NMDS Axis 1", ylab="NMDS Axis 2", 
     col = c("blue","purple","red")[as.factor(mdat.h2.s$Host_Spp)],
     pch=19)
ordispider(nmds.h2.16s$points, groups=mdat.h2.s$Host_Spp, show.groups="flava", col=adjustcolor("blue",0.2), spiders = "median", lwd=3)
ordispider(nmds.h2.16s$points, groups=mdat.h2.s$Host_Spp, show.groups="rosea", col=adjustcolor("red",0.2), spiders = "median", lwd=3)
ordispider(nmds.h2.16s$points, groups=mdat.h2.s$Host_Spp, show.groups="flava-rosea", col=adjustcolor("purple",0.4), spiders = "median", lwd=3)
legend("bottomleft", unique(mdat.h2.s$Host_Spp), fill= c("blue", "purple", "red"),
       bty="n", cex = 1)
#Hybrids figure 4G - 18s NMDS; flava, rosea, flava-rosea####
plot(nmds.h2.18s$points, xlab="NMDS Axis 1", ylab="NMDS Axis 2", 
     col = c("blue","purple","red")[as.factor(mdat.h2.s$Host_Spp)],
     pch=19)
ordispider(nmds.h2.18s$points, groups=mdat.h2.s$Host_Spp, show.groups="flava", col=adjustcolor("blue",0.2), spiders = "median", lwd=3)
ordispider(nmds.h2.18s$points, groups=mdat.h2.s$Host_Spp, show.groups="rosea", col=adjustcolor("red",0.2), spiders = "median", lwd=3)
ordispider(nmds.h2.18s$points, groups=mdat.h2.s$Host_Spp, show.groups="flava-rosea", col=adjustcolor("purple",0.4), spiders = "median", lwd=3)
legend("topleft", unique(mdat.h2.s$Host_Spp), fill= c("blue", "purple","red"),
       bty="n", cex = 1)

#NMDS - 16s & 18s; leucophylla, leuco-rosea, rosea only####
#Subset data 
sar16s.h3.s <- subset(sar16s.r, mdat$Host_Spp %in% c("leucophylla", "leuco-rosea", "rosea"))
sar18s.h3.s <- subset(sar18s.r, mdat$Host_Spp %in% c("leucophylla", "leuco-rosea", "rosea"))
mdat.h3.s <- subset(mdat, mdat$Host_Spp %in% c("leucophylla", "leuco-rosea", "rosea"))
mdat.h3.s$Host_Spp <- mdat.h3.s$Host_Spp[drop=T]

#create physeq object
physeq.h2.16s <- merge_phyloseq(otu_table(t(sar16s.h3.s), taxa_are_rows = T), 
                             prune.sample(sar16s.h3.s, phylo.16s))
#Calculate weighted Unifrac distance
dist.h2.16s <- phyloseq::distance(physeq.h2.16s, method = "wunifrac")

### PERMANOVAs
hybrid.perm.h2.16s <- adonis(formula = dist.h2.16s ~ Host_Spp, data = mdat.h3.s)
hybrid.perm.h2.16s

hybrid.perm.h2.16s.pw <- pairwise.adonis(dist.h2.16s, mdat.h3.s$Host_Spp, p.adjust.m='holm')
hybrid.perm.h2.16s.pw

#run NMDS
set.seed(111)
nmds.h2.16s <- vegan::metaMDS(dist.h2.16s, k = 2, trymax=1000)

#18s
#Load ASV table and pruned phylo tree, create Unifrac object
physeq.h2.18s <- merge_phyloseq(otu_table(t(sar18s.h3.s), taxa_are_rows = T), 
                             prune.sample(sar18s.h3.s, phylo.18s))
#Calculate weighted Unifrac distance
dist.h2.18s <- phyloseq::distance(physeq.h2.18s, method = "wunifrac")

#PERMANOVAs
hybrid.perm.h2.18s <- adonis(formula = dist.h2.18s ~ Host_Spp, data = mdat.h3.s)
hybrid.perm.h2.18s

hybrid.perm.h2.18s.pw <- pairwise.adonis(dist.h2.18s, mdat.h3.s$Host_Spp, p.adjust.m='holm')
hybrid.perm.h2.18s.pw

#run NMDS
set.seed(111)
nmds.h2.18s <- vegan::metaMDS(dist.h2.18s, k=2, trymax=1000)

#Hybrids figure 4B - 16s NMDS; leucophylla, leuco-rosea, rosea####
plot(nmds.h2.16s$points, xlab="NMDS Axis 1", ylab="NMDS Axis 2", 
     col = c("darkorange1","gold","red")[as.factor(mdat.h3.s$Host_Spp)],
     pch=19)
ordispider(nmds.h2.16s$points, groups=mdat.h3.s$Host_Spp, show.groups="rosea", col=adjustcolor("red",0.2), spiders = "median", lwd=3)
ordispider(nmds.h2.16s$points, groups=mdat.h3.s$Host_Spp, show.groups="leucophylla", col=adjustcolor("gold",0.2), spiders = "median", lwd=3)
ordispider(nmds.h2.16s$points, groups=mdat.h3.s$Host_Spp, show.groups="leuco-rosea", col=adjustcolor("darkorange1",0.4), spiders = "median", lwd=3)
legend("topleft", unique(mdat.h3.s$Host_Spp), fill= c("gold","darkorange1","red"),
       bty="n", cex = 1)
#Hybrids figure 4H - 18s NMDS; leucophylla, leuco-rosea, rosea####
plot(nmds.h2.18s$points, xlab="NMDS Axis 1", ylab="NMDS Axis 2", 
     col = c("darkorange1","gold","red")[as.factor(mdat.h3.s$Host_Spp)],
     pch=19)
ordispider(nmds.h2.18s$points, groups=mdat.h3.s$Host_Spp, show.groups="rosea", col=adjustcolor("red",0.2), spiders = "median", lwd=3)
ordispider(nmds.h2.18s$points, groups=mdat.h3.s$Host_Spp, show.groups="leucophylla", col=adjustcolor("gold",0.2), spiders = "median", lwd=3)
ordispider(nmds.h2.18s$points, groups=mdat.h3.s$Host_Spp, show.groups="leuco-rosea", col=adjustcolor("darkorange1",0.4), spiders = "median", lwd=3)
legend("topleft", unique(mdat.h3.s$Host_Spp), fill= c("gold", "darkorange1", "red"),
       bty="n", cex = 1)


## ANCOM ####
#Detect ASVs playing significant role in driving trends for specified factor

#Load Function 'ANCOM.main' from S. Mandal (2017)####
ANCOM.main = function(OTUdat,Vardat,
                      adjusted,repeated,
                      main.var,adj.formula,
                      repeat.var,longitudinal,
                      random.formula,
                      multcorr,sig,
                      prev.cut){
  
  p.zeroes=apply(OTUdat[,-1],2,function(x){
    s=length(which(x==0))/length(x)
  })
  
  zeroes.dist=data.frame(colnames(OTUdat)[-1],p.zeroes,row.names=NULL)
  colnames(zeroes.dist)=c("Taxon","Proportion_zero")
  
  zero.plot = ggplot(zeroes.dist, aes(x=Proportion_zero)) + 
    geom_histogram(binwidth=0.1,colour="black",fill="white") + 
    xlab("Proportion of zeroes") + ylab("Number of taxa") +
    theme_bw()
  
  #print(zero.plot)
  
  OTUdat.thinned=OTUdat
  OTUdat.thinned=OTUdat.thinned[,c(1,1+which(p.zeroes<prev.cut))]
  
  asv.names=colnames(OTUdat.thinned)[-1]
  
  W.detected   <- ancom.W(OTUdat.thinned,Vardat,
                          adjusted,repeated,
                          main.var,adj.formula,
                          repeat.var,longitudinal,random.formula,
                          multcorr,sig)
  
  W_stat       <- W.detected
  
  
  ### Bubble plot
  
  W_frame = data.frame(asv.names,W_stat,row.names=NULL)
  W_frame = W_frame[order(-W_frame$W_stat),]
  
  W_frame$detected_0.9=rep(FALSE,dim(W_frame)[1])
  W_frame$detected_0.8=rep(FALSE,dim(W_frame)[1])
  W_frame$detected_0.7=rep(FALSE,dim(W_frame)[1])
  W_frame$detected_0.6=rep(FALSE,dim(W_frame)[1])
  
  W_frame$detected_0.9[which(W_frame$W_stat>0.9*(dim(OTUdat.thinned[,-1])[2]-1))]=TRUE
  W_frame$detected_0.8[which(W_frame$W_stat>0.8*(dim(OTUdat.thinned[,-1])[2]-1))]=TRUE
  W_frame$detected_0.7[which(W_frame$W_stat>0.7*(dim(OTUdat.thinned[,-1])[2]-1))]=TRUE
  W_frame$detected_0.6[which(W_frame$W_stat>0.6*(dim(OTUdat.thinned[,-1])[2]-1))]=TRUE
  
  final_results=list(W_frame,zero.plot)
  names(final_results)=c("W.taxa","PLot.zeroes")
  return(final_results)
}

#Load Function 'ANCOM.w' from S. Mandal (2017) ####
ancom.W = function(otu_data,var_data,
                   adjusted,repeated,
                   main.var,adj.formula,
                   repeat.var,long,rand.formula,
                   multcorr,sig){
  
  n_otu=dim(otu_data)[2]-1
  
  otu_ids=colnames(otu_data)[-1]
  
  if(repeated==F){
    data_comp=data.frame(merge(otu_data,var_data[,c("Sample.ID",main.var)],by="Sample.ID",row.names=NULL), check.names = F)
    #data_comp=data.frame(merge(otu_data,var_data[,c("Sample.ID",main.var)],by="Sample.ID",all.y=T),row.names=NULL)
  }else if(repeated==T){
    data_comp=data.frame(merge(otu_data,var_data,by=otu_data$Sample.ID),row.names=NULL, check.names=F)
    # data_comp=data.frame(merge(otu_data,var_data[,c("Sample.ID",main.var,repeat.var)],by="Sample.ID"),row.names=NULL)
  }
  
  base.formula = paste0("lr ~ ",main.var)
  if(repeated==T){
    repeat.formula = paste0(base.formula," | ", repeat.var)
  }
  if(adjusted==T){
    adjusted.formula = paste0(base.formula," + ", adj.formula)
  }
  
  if( adjusted == F & repeated == F ){
    fformula  <- formula(base.formula)
  } else if( adjusted == F & repeated == T & long == T ){
    fformula  <- formula(base.formula)   
  }else if( adjusted == F & repeated == T & long == F ){
    fformula  <- formula(repeat.formula)   
  }else if( adjusted == T & repeated == F  ){
    fformula  <- formula(adjusted.formula)   
  }else if( adjusted == T & repeated == T  ){
    fformula  <- formula(adjusted.formula)   
  }else{
    stop("Problem with data. Dataset should contain OTU abundances, groups, 
         and optionally an ID for repeated measures.")
  }
  
  if( repeated==FALSE & adjusted == FALSE){
    if( length(unique(data_comp[,which(colnames(data_comp)==main.var)]))==2 ){
      tfun <- exactRankTests::wilcox.exact
    } else{
      tfun <- stats::kruskal.test
    }
  }else if( repeated==FALSE & adjusted == TRUE){
    tfun <- stats::aov
  }else if( repeated== TRUE & adjusted == FALSE & long == FALSE){
    tfun <- stats::friedman.test
  }else if( repeated== TRUE & adjusted == FALSE & long == TRUE){
    tfun <- nlme::lme
  }else if( repeated== TRUE & adjusted == TRUE){
    tfun <- nlme::lme
  }
  
  logratio.mat <- matrix(NA, nrow=n_otu, ncol=n_otu)
  for(ii in 1:(n_otu-1)){
    for(jj in (ii+1):n_otu){
      data.pair <- data_comp[,which(colnames(data_comp)%in%otu_ids[c(ii,jj)])]
      lr <- log((1+as.numeric(data.pair[,1]))/(1+as.numeric(data.pair[,2])))
      
      datalr_dat <- data.frame( lr=lr, data_comp,row.names=NULL )
      
      if(adjusted==FALSE&repeated==FALSE){  ## Wilcox, Kruskal Wallis
        logratio.mat[ii,jj] <- tfun( formula=fformula, data = datalr_dat)$p.value
      }else if(adjusted==FALSE&repeated==TRUE&long==FALSE){ ## Friedman's 
        logratio.mat[ii,jj] <- tfun( formula=fformula, data = datalr_dat)$p.value
      }else if(adjusted==TRUE&repeated==FALSE){ ## ANOVA
        model=tfun(formula=fformula, data = datalr_dat,na.action=na.omit)   
        picker=which(gsub(" ","",row.names(summary(model)[[1]]))==main.var)  
        logratio.mat[ii,jj] <- summary(model)[[1]][["Pr(>F)"]][picker]
      }else if(repeated==TRUE&long==TRUE){ ## GEE
        model=tfun(fixed=fformula,data = datalr_dat,
                   random = formula(rand.formula),
                   correlation=corAR1(),
                   na.action=na.omit)   
        picker=which(gsub(" ","",row.names(anova(model)))==main.var)
        logratio.mat[ii,jj] <- anova(model)[["p-value"]][picker]
      }
      
    }
  } 
  
  ind <- lower.tri(logratio.mat)
  logratio.mat[ind] <- t(logratio.mat)[ind]
  
  
  logratio.mat[which(is.finite(logratio.mat)==FALSE)] <- 1
  
  mc.pval <- t(apply(logratio.mat,1,function(x){
    s <- p.adjust(x, method = "BH")
    return(s)
  }))
  
  a <- logratio.mat[upper.tri(logratio.mat,diag=FALSE)==TRUE]
  
  b <- matrix(0,ncol=n_otu,nrow=n_otu)
  b[upper.tri(b)==T] <- p.adjust(a, method = "BH")
  diag(b)  <- NA
  ind.1    <- lower.tri(b)
  b[ind.1] <- t(b)[ind.1]
  
  
  ### Code to extract surrogate p-value ####
  surr.pval <- apply(mc.pval,1,function(x){
    s0=quantile(x[which(as.numeric(as.character(x))<sig)],0.95)
    # s0=max(x[which(as.numeric(as.character(x))<alpha)])
    return(s0)
  })
  
  ### Conservative
  if(multcorr==1){
    W <- apply(b,1,function(x){
      subp <- length(which(x<sig))
    })
    ### Moderate
  } else if(multcorr==2){
    W <- apply(mc.pval,1,function(x){
      subp <- length(which(x<sig))
    })
    ### No correction
  } else if(multcorr==3){
    W <- apply(logratio.mat,1,function(x){
      subp <- length(which(x<sig))
    })
  }
  
  return(W)
}
##### End of functions ####

#ANCOM - 16s & 18s, host species as factor#####
#Create "Sample.ID" column all data tables
#ANCOM requires that data be formatted so that first *column* is named "Sample.ID"
#Sample IDs as row names does not count!
sar16s_sbst <- data.frame("Sample.ID" = row.names(sar16s.r), sar16s.r, check.names = F)
sar18s_sbst <- data.frame("Sample.ID" = row.names(sar18s.r), sar18s.r, check.names = F)
mdat_sbst <- data.frame("Sample.ID" = row.names(mdat), mdat)

#Check and make sure tables align
row.names(mdat_sbst) == row.names(sar18s_sbst)

#Run ANCOM, specify variable
ANCOM.16s <- ANCOM.main(sar16s_sbst,mdat_sbst,F,F,"Host_Spp",NULL,NULL,F,NULL,2,.05,.9)
ANCOM.18s <- ANCOM.main(sar18s_sbst,mdat_sbst,F,F,"Host_Spp",NULL,NULL,F,NULL,2,.05,.9)
#Create objects of significant ASVs
sigASVs_16s <- subset(ANCOM.16s$W.taxa, ANCOM.16s$W.taxa$W_stat > 0)[,1]
sigASVs_18s <- subset(ANCOM.18s$W.taxa, ANCOM.18s$W.taxa$W_stat > 0)[,1]

write.csv(ANCOM.16s$W.taxa, file = "16S_ANCOM_all_species.csv")
write.csv(ANCOM.18s$W.taxa, file = "18S_ANCOM_all_species.csv")

#ANCOM - 16s & 18s, leucophylla only, Longitude####
#Subset by Host Species of interest
sar16s_sbst <- subset(sar16s.r, mdat$Host_Spp %in% c("leucophylla"))
sar18s_sbst <- subset(sar18s.r, mdat$Host_Spp %in% c("leucophylla"))
mdat_sbst <- subset(mdat, mdat$Host_Spp %in% c("leucophylla"))
mdat_sbst$Host_Spp <- mdat_sbst$Host_Spp[drop=T]

sar16s_sbst <- sar16s_sbst[,colSums(sar16s_sbst) > 0] ## remove ASVs no longer present
sar18s_sbst <- sar18s_sbst[,colSums(sar18s_sbst) > 0] ## remove ASVs no longer present


#Create "Sample.ID" column all data tables
#ANCOM requires that data be formatted so that first *column* is named "Sample.ID"
#Sample IDs as row names does not count!
sar16s_sbst <- data.frame("Sample.ID" = row.names(sar16s_sbst), sar16s_sbst, check.names = F)
sar18s_sbst <- data.frame("Sample.ID" = row.names(sar18s_sbst), sar18s_sbst, check.names = F)
mdat_sbst <- data.frame("Sample.ID" = row.names(mdat_sbst), mdat_sbst)

#Check and make sure tables align
row.names(mdat_sbst) == row.names(sar18s_sbst)

#Run ANCOM, specify variable
ANCOM.16s <- ANCOM.main(sar16s_sbst,mdat_sbst,F,F,"Longitude",NULL,NULL,T,NULL,2,.05,.9)
ANCOM.18s <- ANCOM.main(sar18s_sbst,mdat_sbst,F,F,"Longitude",NULL,NULL,T,NULL,2,.05,.9)

#Create objects of significant ASVs
ANCOM.16s$W.taxa
ANCOM.18s$W.taxa
### No ASVs are signficant above 0.8 (or even 0.6) cutoff

#ANCOM - 16s & 18s, Splinter Hill only, host species ####
#Subset by site of interest
sar16s_sbst <- subset(sar16s.r, mdat$Site %in% c("Splinter_Hill"))
sar18s_sbst <- subset(sar18s.r, mdat$Site %in% c("Splinter_Hill"))
mdat_sbst <- subset(mdat, mdat$Site %in% c("Splinter_Hill"))
mdat_sbst$Host_Spp <- mdat_sbst$Host_Spp[drop=T]

#Create "Sample.ID" column all data tables
#ANCOM requires that data be formatted so that first *column* is named "Sample.ID"
#Sample IDs as row names does not count!
sar16s_sbst <- data.frame("Sample.ID" = row.names(sar16s_sbst), sar16s_sbst, check.names = F)
sar18s_sbst <- data.frame("Sample.ID" = row.names(sar18s_sbst), sar18s_sbst, check.names = F)
mdat_sbst <- data.frame("Sample.ID" = row.names(mdat_sbst), mdat_sbst)

#Check and make sure tables align
row.names(mdat_sbst) == row.names(sar18s_sbst)

#Run ANCOM, specify variable
ANCOM.16s <- ANCOM.main(sar16s_sbst,mdat_sbst,F,F,"Host_Spp",NULL,NULL,T,NULL,2,.05,.9)
ANCOM.18s <- ANCOM.main(sar18s_sbst,mdat_sbst,F,F,"Host_Spp",NULL,NULL,T,NULL,2,.05,.9)
#Create objects of significant ASVs
sigASVs_16s <- subset(ANCOM.16s$W.taxa, ANCOM.16s$W.taxa$W_stat > 0)[,1]
sigASVs_18s <- subset(ANCOM.18s$W.taxa, ANCOM.18s$W.taxa$W_stat > 0)[,1]

write.csv(ANCOM.16s$W.taxa, file = "16S_ANCOM_SH_host.csv")
write.csv(ANCOM.18s$W.taxa, file = "18S_ANCOM_SH_host.csv")

#Subset data by sig ASVs
sar16s_sigsbst <-  t(subset(t(sar16s_sbst), colnames(sar16s_sbst) %in% sigASVs_16s))
sar18s_sigsbst <-  t(subset(t(sar18s_sbst), colnames(sar18s_sbst) %in% sigASVs_18s))

spplot <- data.frame(sar16s_sigsbst[,1:3], "Host_Species" = mdat_sbst$Host_Spp, check.names = FALSE)
spplot[,1:3] <- lapply(spplot[,1:3], function(x) as.numeric(as.character(x)))
spplot[,1:3] <- log(spplot[,1:3]+1)

spplot.sub <- data.frame(sample=rownames(spplot),spplot, check.names = F)
spplot.sublong <- melt(spplot.sub)

spplot.sublong$Host_Species <-  factor(spplot.sublong$Host_Species, levels = c("rosea", "leuco-rosea", "leucophylla", "rubra"))

### Figure 2E - 16S SH top ANCOM ASVs ####
ggplot(spplot.sublong, aes(y = value, x = Host_Species, color=variable))+
  geom_boxplot(outlier.shape = NA) + 
  geom_point(position=position_dodge(width=0.75), aes(group=variable, shape = variable), alpha =.4) +
  ylab("Log  rel. abundance") + xlab("Host species") + theme_classic()

## Plot for 18s
spplot18s <- data.frame(sar18s_sigsbst[,1:6], "Host_Species" = mdat_sbst$Host_Spp, check.names = FALSE)
spplot18s[,1:6] <- lapply(spplot18s[,1:6], function(x) as.numeric(as.character(x)))
spplot18s[,1:6] <- log(spplot18s[,1:6]+1)

spplot18s.sub <- data.frame(sample=rownames(spplot18s),spplot18s, check.names = F)
spplot18s.sublong <- melt(spplot18s.sub)

spplot18s.sublong$Host_Species <-  factor(spplot18s.sublong$Host_Species, levels = c("rosea", "leuco-rosea", "leucophylla", "rubra"))

### Figure 2E - 18S SH top ANCOM ASVs ####
ggplot(spplot18s.sublong, aes(y = value, x = Host_Species, color=variable))+
  geom_boxplot(outlier.shape = NA) + 
  geom_point(position=position_dodge(width=0.75), aes(group=variable, shape = variable), alpha =.4) +
  ylab("Log  rel. abundance") + xlab("Host species") + theme_classic()


#ANCOM - Family level; 16s; hybrids and parent species ####
#Subset by Host Species of interest
##Prep data
#Subset by Host Species
sar_sbst1 <- subset(sar16s.r_family, mdat$Host_Spp %in% c("flava","flava-leuco","leuco"))
#Subset Metadata
mdat_sbst1 <- subset(mdat, row.names(mdat) %in% row.names(sar_sbst1))
#Create "Sample.ID" column in sar.r
sar_sbst1 <- data.frame("Sample.ID" = row.names(sar_sbst1), sar_sbst1, check.names = F)

#Create "Sample.ID" column in meta
mdat_sbst1 <- data.frame("Sample.ID" = row.names(mdat_sbst1), mdat_sbst1)
row.names(mdat_sbst1) == row.names(sar_sbst1)

#Subset by Host Species
sar_sbst2 <- subset(sar16s.r_family, mdat$Host_Spp %in% c("flava","flava-rosea","rosea"))
#Subset Metadata
mdat_sbst2 <- subset(mdat, row.names(mdat) %in% row.names(sar_sbst2))
#Create "Sample.ID" column in sar.r
sar_sbst2 <- data.frame("Sample.ID" = row.names(sar_sbst2), sar_sbst2, check.names = F)

#Create "Sample.ID" column in meta
mdat_sbst2 <- data.frame("Sample.ID" = row.names(mdat_sbst2), mdat_sbst2)
row.names(mdat_sbst2) == row.names(sar_sbst2)

#Subset by Host Species
sar_sbst3 <- subset(sar16s.r_family, mdat$Host_Spp %in% c("leucophylla","leuco-rosea","rosea"))
#Subset Metadata
mdat_sbst3 <- subset(mdat, row.names(mdat) %in% row.names(sar_sbst3))
#Create "Sample.ID" column in sar.r
sar_sbst3 <- data.frame("Sample.ID" = row.names(sar_sbst3), sar_sbst3, check.names = F)

#Create "Sample.ID" column in meta
mdat_sbst3 <- data.frame("Sample.ID" = row.names(mdat_sbst3), mdat_sbst3)
row.names(mdat_sbst3) == row.names(sar_sbst3)

#Run ANCOM, specify variable
ANCOM.16s.sbst1 <- ANCOM.main(sar_sbst1,mdat_sbst1,F,F,"Host_Spp",NULL,NULL,T,NULL,2,.05,.9)
#Create objects of significant ASVs
ANCOM.16s.sbst1.taxa <- ANCOM.16s.sbst1$W.taxa
write.csv(ANCOM.16s.sbst1$W.taxa, file = "16S_ANCOM_hybrids1.csv")

ANCOM.16s.sbst2 <- ANCOM.main(sar_sbst2,mdat_sbst2,F,F,"Host_Spp",NULL,NULL,T,NULL,2,.05,.9)
#Create objects of significant ASVs
ANCOM.16s.sbst2.taxa <- ANCOM.16s.sbst2$W.taxa
write.csv(ANCOM.16s.sbst2$W.taxa, file = "16S_ANCOM_hybrids2.csv")

ANCOM.16s.sbst3 <- ANCOM.main(sar_sbst3,mdat_sbst3,F,F,"Host_Spp",NULL,NULL,T,NULL,2,.05,.9)
#Create objects of significant ASVs
ANCOM.16s.sbst3.taxa <- ANCOM.16s.sbst3$W.taxa
write.csv(ANCOM.16s.sbst3$W.taxa, file = "16S_ANCOM_hybrids3.csv")


#### Violin plots with ANCOM families, Supplementary Figure 5 ####
### Cytophagaceae
aplot <- data.frame(Host = mdat_sbst2$Host_Spp,
                    Cytophagaceae = sar_sbst2$Cytophagaceae, check.names = F)
aplot.m <- reshape2::melt(aplot, id = "Host")
aplot.m$value <- log(aplot.m$value)
aplot.m$value[aplot.m$value == "-Inf"] <- 0
#Re-arrange factors if necessary
aplot.m$Host <- factor(aplot.m$Host, levels = c("flava", "flava-rosea", "rosea"))

ggplot(aplot.m, aes(y = value, x = factor(Host))) + 
  geom_violin(trim = FALSE) + geom_jitter(width = .025) + 
  stat_summary(fun = mean, geom="line", group= 1, color= "grey", size = 1.25, alpha = .75) +
  labs(subtitle = "Cytophagaceae", x = "Host Species", y = "Natural log of rel. abundance") +
  # scale_y_continuous(limits=c(0, 10))+
  theme_classic()

### Flavobacteriaceae
aplot <- data.frame(Host = mdat_sbst2$Host_Spp,
                    Flavobacteriaceae = sar_sbst2$Flavobacteriaceae, check.names = F)
aplot.m <- reshape2::melt(aplot, id = "Host")
aplot.m$value <- log(aplot.m$value)
aplot.m$value[aplot.m$value == "-Inf"] <- 0
#Re-arrange factors if necessary
aplot.m$Host <- factor(aplot.m$Host, levels = c("flava", "flava-rosea", "rosea"))

ggplot(aplot.m, aes(y = value, x = factor(Host))) + 
  geom_violin(trim = FALSE) + geom_jitter(width = .025) + 
  stat_summary(fun = mean, geom="line", group= 1, color= "grey", size = 1.25, alpha = .75) +
  labs(subtitle = "Flavobacteriaceae", x = "Host Species", y = "Natural log of rel. abundance") +
  # scale_y_continuous(limits=c(0, 10))+
  theme_classic()

### Hyphomicrobiaceae
aplot <- data.frame(Host = mdat_sbst2$Host_Spp,
                    Hyphomicrobiaceae = sar_sbst2$Hyphomicrobiaceae, check.names = F)
aplot.m <- reshape2::melt(aplot, id = "Host")
aplot.m$value <- log(aplot.m$value)
aplot.m$value[aplot.m$value == "-Inf"] <- 0
#Re-arrange factors if necessary
aplot.m$Host <- factor(aplot.m$Host, levels = c("flava", "flava-rosea", "rosea"))

ggplot(aplot.m, aes(y = value, x = factor(Host))) + 
  geom_violin(trim = FALSE) + geom_jitter(width = .025) + 
  stat_summary(fun = mean, geom="line", group= 1, color= "grey", size = 1.25, alpha = .75) +
  labs(subtitle = "Hyphomicrobiaceae", x = "Host Species", y = "Natural log of rel. abundance") +
  # scale_y_continuous(limits=c(0, 10))+
  theme_classic()

#### Acetobacteraceae
aplot <- data.frame(Host = mdat_sbst2$Host_Spp,
                    Acetobacteraceae = sar_sbst2$Acetobacteraceae, check.names = F)
aplot.m <- reshape2::melt(aplot, id = "Host")
aplot.m$value <- log(aplot.m$value)
aplot.m$value[aplot.m$value == "-Inf"] <- 0
#Re-arrange factors if necessary
aplot.m$Host <- factor(aplot.m$Host, levels = c("flava", "flava-rosea", "rosea"))

ggplot(aplot.m, aes(y = value, x = factor(Host))) + 
  geom_violin(trim = FALSE) + geom_jitter(width = .025) + 
  stat_summary(fun = mean, geom="line", group= 1, color= "grey", size = 1.25, alpha = .75) +
  labs(subtitle = "Acetobacteraceae", x = "Host Species", y = "Natural log of rel. abundance") +
  # scale_y_continuous(limits=c(0, 10))+
  theme_classic()

### Oxalobacteraceae
aplot <- data.frame(Host = mdat_sbst2$Host_Spp,
                    Oxalobacteraceae = sar_sbst2$Oxalobacteraceae, check.names = F)
aplot.m <- reshape2::melt(aplot, id = "Host")
aplot.m$value <- log(aplot.m$value)
aplot.m$value[aplot.m$value == "-Inf"] <- 0
#Re-arrange factors if necessary
aplot.m$Host <- factor(aplot.m$Host, levels = c("flava", "flava-rosea", "rosea"))

ggplot(aplot.m, aes(y = value, x = factor(Host))) + 
  geom_violin(trim = FALSE) + geom_jitter(width = .025) + 
  stat_summary(fun = mean, geom="line", group= 1, color= "grey", size = 1.25, alpha = .75) +
  labs(subtitle = "Oxalobacteraceae", x = "Host Species", y = "Natural log of rel. abundance") +
  # scale_y_continuous(limits=c(0, 10))+
  theme_classic()

### Porphyromonadaceae
aplot <- data.frame(Host = mdat_sbst2$Host_Spp,
                    Porphyromonadaceae = sar_sbst2$Porphyromonadaceae, check.names = F)
aplot.m <- reshape2::melt(aplot, id = "Host")
aplot.m$value <- log(aplot.m$value)
aplot.m$value[aplot.m$value == "-Inf"] <- 0
#Re-arrange factors if necessary
aplot.m$Host <- factor(aplot.m$Host, levels = c("flava", "flava-rosea", "rosea"))

ggplot(aplot.m, aes(y = value, x = factor(Host))) + 
  geom_violin(trim = FALSE) + geom_jitter(width = .025) + 
  stat_summary(fun = mean, geom="line", group= 1, color= "grey", size = 1.25, alpha = .75) +
  labs(subtitle = "Porphyromonadaceae", x = "Host Species", y = "Natural log of rel. abundance") +
  # scale_y_continuous(limits=c(0, 10))+
  theme_classic()

### Beijerinckiaceae
aplot <- data.frame(Host = mdat_sbst2$Host_Spp,
                    Beijerinckiaceae = sar_sbst2$Beijerinckiaceae, check.names = F)
aplot.m <- reshape2::melt(aplot, id = "Host")
aplot.m$value <- log(aplot.m$value)
aplot.m$value[aplot.m$value == "-Inf"] <- 0
#Re-arrange factors if necessary
aplot.m$Host <- factor(aplot.m$Host, levels = c("flava", "flava-rosea", "rosea"))

ggplot(aplot.m, aes(y = value, x = factor(Host))) + 
  geom_violin(trim = FALSE) + geom_jitter(width = .025) + 
  stat_summary(fun = mean, geom="line", group= 1, color= "grey", size = 1.25, alpha = .75) +
  labs(subtitle = "Beijerinckiaceae", x = "Host Species", y = "Natural log of rel. abundance") +
  # scale_y_continuous(limits=c(0, 10))+
  theme_classic()


#### Leuco-Rosea fams
### Flavobacteriaceae
aplot <- data.frame(Host = mdat_sbst3$Host_Spp,
                    Flavobacteriaceae = sar_sbst3$Flavobacteriaceae, check.names = F)
aplot.m <- reshape2::melt(aplot, id = "Host")
aplot.m$value <- log(aplot.m$value)
aplot.m$value[aplot.m$value == "-Inf"] <- 0
#Re-arrange factors if necessary
aplot.m$Host <- factor(aplot.m$Host, levels = c("leucophylla", "leuco-rosea", "rosea"))

ggplot(aplot.m, aes(y = value, x = factor(Host))) + 
  geom_violin(trim = FALSE) + geom_jitter(width = .025) + 
  stat_summary(fun = mean, geom="line", group= 1, color= "grey", size = 1.25, alpha = .75) +
  labs(subtitle = "Flavobacteriaceae", x = "Host Species", y = "Natural log of rel. abundance") +
  # scale_y_continuous(limits=c(0, 10))+
  theme_classic()

### Cytophagaceae
aplot <- data.frame(Host = mdat_sbst3$Host_Spp,
                    Cytophagaceae = sar_sbst3$Cytophagaceae, check.names = F)
aplot.m <- reshape2::melt(aplot, id = "Host")
aplot.m$value <- log(aplot.m$value)
aplot.m$value[aplot.m$value == "-Inf"] <- 0
#Re-arrange factors if necessary
aplot.m$Host <- factor(aplot.m$Host, levels = c("leucophylla", "leuco-rosea", "rosea"))

ggplot(aplot.m, aes(y = value, x = factor(Host))) + 
  geom_violin(trim = FALSE) + geom_jitter(width = .025) + 
  stat_summary(fun = mean, geom="line", group= 1, color= "grey", size = 1.25, alpha = .75) +
  labs(subtitle = "Cytophagaceae", x = "Host Species", y = "Natural log of rel. abundance") +
  # scale_y_continuous(limits=c(0, 10))+
  theme_classic()

### Oxalobacteraceae
aplot <- data.frame(Host = mdat_sbst3$Host_Spp,
                    Oxalobacteraceae = sar_sbst3$Oxalobacteraceae, check.names = F)
aplot.m <- reshape2::melt(aplot, id = "Host")
aplot.m$value <- log(aplot.m$value)
aplot.m$value[aplot.m$value == "-Inf"] <- 0
#Re-arrange factors if necessary
aplot.m$Host <- factor(aplot.m$Host, levels = c("leucophylla", "leuco-rosea", "rosea"))

ggplot(aplot.m, aes(y = value, x = factor(Host))) + 
  geom_violin(trim = FALSE) + geom_jitter(width = .025) + 
  stat_summary(fun = mean, geom="line", group= 1, color= "grey", size = 1.25, alpha = .75) +
  labs(subtitle = "Oxalobacteraceae", x = "Host Species", y = "Natural log of rel. abundance") +
  # scale_y_continuous(limits=c(0, 10))+
  theme_classic()

### Hyphomicrobiaceae
aplot <- data.frame(Host = mdat_sbst3$Host_Spp,
                    Hyphomicrobiaceae = sar_sbst3$Hyphomicrobiaceae, check.names = F)
aplot.m <- reshape2::melt(aplot, id = "Host")
aplot.m$value <- log(aplot.m$value)
aplot.m$value[aplot.m$value == "-Inf"] <- 0
#Re-arrange factors if necessary
aplot.m$Host <- factor(aplot.m$Host, levels = c("leucophylla", "leuco-rosea", "rosea"))

ggplot(aplot.m, aes(y = value, x = factor(Host))) + 
  geom_violin(trim = FALSE) + geom_jitter(width = .025) + 
  stat_summary(fun = mean, geom="line", group= 1, color= "grey", size = 1.25, alpha = .75) +
  labs(subtitle = "Hyphomicrobiaceae", x = "Host Species", y = "Natural log of rel. abundance") +
  # scale_y_continuous(limits=c(0, 10))+
  theme_classic()

### Patulibacteraceae
aplot <- data.frame(Host = mdat_sbst3$Host_Spp,
                    Patulibacteraceae = sar_sbst3$Patulibacteraceae, check.names = F)
aplot.m <- reshape2::melt(aplot, id = "Host")
aplot.m$value <- log(aplot.m$value)
aplot.m$value[aplot.m$value == "-Inf"] <- 0
#Re-arrange factors if necessary
aplot.m$Host <- factor(aplot.m$Host, levels = c("leucophylla", "leuco-rosea", "rosea"))

ggplot(aplot.m, aes(y = value, x = factor(Host))) + 
  geom_violin(trim = FALSE) + geom_jitter(width = .025) + 
  stat_summary(fun = mean, geom="line", group= 1, color= "grey", size = 1.25, alpha = .75) +
  labs(subtitle = "Patulibacteraceae", x = "Host Species", y = "Natural log of rel. abundance") +
  # scale_y_continuous(limits=c(0, 10))+
  theme_classic()

### Xanthobacteraceae
aplot <- data.frame(Host = mdat_sbst3$Host_Spp,
                    Xanthobacteraceae = sar_sbst3$Xanthobacteraceae, check.names = F)
aplot.m <- reshape2::melt(aplot, id = "Host")
aplot.m$value <- log(aplot.m$value)
aplot.m$value[aplot.m$value == "-Inf"] <- 0
#Re-arrange factors if necessary
aplot.m$Host <- factor(aplot.m$Host, levels = c("leucophylla", "leuco-rosea", "rosea"))

ggplot(aplot.m, aes(y = value, x = factor(Host))) + 
  geom_violin(trim = FALSE) + geom_jitter(width = .025) + 
  stat_summary(fun = mean, geom="line", group= 1, color= "grey", size = 1.25, alpha = .75) +
  labs(subtitle = "Xanthobacteraceae", x = "Host Species", y = "Natural log of rel. abundance") +
  # scale_y_continuous(limits=c(0, 10))+
  theme_classic()

### Porphyromonadaceae
aplot <- data.frame(Host = mdat_sbst3$Host_Spp,
                    Porphyromonadaceae = sar_sbst3$Porphyromonadaceae, check.names = F)
aplot.m <- reshape2::melt(aplot, id = "Host")
aplot.m$value <- log(aplot.m$value)
aplot.m$value[aplot.m$value == "-Inf"] <- 0
#Re-arrange factors if necessary
aplot.m$Host <- factor(aplot.m$Host, levels = c("leucophylla", "leuco-rosea", "rosea"))

ggplot(aplot.m, aes(y = value, x = factor(Host))) + 
  geom_violin(trim = FALSE) + geom_jitter(width = .025) + 
  stat_summary(fun = mean, geom="line", group= 1, color= "grey", size = 1.25, alpha = .75) +
  labs(subtitle = "Porphyromonadaceae", x = "Host Species", y = "Natural log of rel. abundance") +
  # scale_y_continuous(limits=c(0, 10))+
  theme_classic()

### Cellulomonadaceae
aplot <- data.frame(Host = mdat_sbst3$Host_Spp,
                    Cellulomonadaceae = sar_sbst3$Cellulomonadaceae, check.names = F)
aplot.m <- reshape2::melt(aplot, id = "Host")
aplot.m$value <- log(aplot.m$value)
aplot.m$value[aplot.m$value == "-Inf"] <- 0
#Re-arrange factors if necessary
aplot.m$Host <- factor(aplot.m$Host, levels = c("leucophylla", "leuco-rosea", "rosea"))

ggplot(aplot.m, aes(y = value, x = factor(Host))) + 
  geom_violin(trim = FALSE) + geom_jitter(width = .025) + 
  stat_summary(fun = mean, geom="line", group= 1, color= "grey", size = 1.25, alpha = .75) +
  labs(subtitle = "Cellulomonadaceae", x = "Host Species", y = "Natural log of rel. abundance") +
  # scale_y_continuous(limits=c(0, 10))+
  theme_classic()



## Venn diagrams w/ hybrids for Figure 4 ####
#Microbiome normalization function####
#This function normalizes the microbiome for a single species
#It takes the community data-set and metadata, and subsets the data by the specified host species
#This function will produce one randomly sampled microbiome for every iteration entered.
#The random sample is a sample of all ASVs present in all of the samples for the specified host species 
#The number of ASVs in the product is equal to the mean number present in all samples for specified host species
#The probability that an ASV with be "randomly" sampled for the product is weighted by its abundance in the samples of the host species.
#The # of iterations defaults to 0, when comparing different host species the # of iterations should be specified as the lowest number of samples for a host species of interest. 
random_avg_microbiome <- function(Data, Metadata, Host_Species, Iterations = 0){
  
  a <- c()
  
  for(i in 1:Iterations){
    a <- append(a, sample(colnames(subset(Data, Metadata$Host_Spp %in% Host_Species)), 
                          size= mean(t(data.frame(rowSums(1*(subset(Data, Metadata$Host_Spp %in% Host_Species)>0))))),
                          replace = FALSE, 
                          prob = c(colSums(1*(subset(Data, Metadata$Host_Spp %in% Host_Species)>0))
                                   / nrow(subset(Data, Metadata$Host_Spp %in% Host_Species)))))
  }
  return(unique(a))
}
#####

#Generate plots comparing host species and their hybrids
#Fig. 4F Venn - 16s; flava, leucophylla, flava-leucophylla only####
set.seed(111)
f <- random_avg_microbiome(Data = sar16s.r, Metadata = mdat, Host_Species = "flava", 
                           Iterations =  sum(mdat$Host_Spp %in% "flava-leuco"))
set.seed(111)
fl <- random_avg_microbiome(Data = sar16s.r, Metadata = mdat, Host_Species = "flava-leuco", 
                            Iterations =  sum(mdat$Host_Spp %in% "flava-leuco"))
set.seed(111)
l <- random_avg_microbiome(Data = sar16s.r, Metadata = mdat, Host_Species = "leucophylla", 
                           Iterations =  sum(mdat$Host_Spp %in% "flava-leuco"))

plotVenn(list("Flava" = f, "Flava-Leucophylla"=fl, "Leucophylla"=l),
         nCycles = 7000, showPlot = T, setColors = c("blue", "forestgreen", "gold"))

#Fig. 4L Venn - 18s; flava, leucophylla, flava-leucophylla only####
set.seed(111)
f <- random_avg_microbiome(Data = sar18s.r, Metadata = mdat, Host_Species = "flava", 
                           Iterations =  sum(mdat$Host_Spp %in% "flava-leuco"))
set.seed(111)
fl <- random_avg_microbiome(Data = sar18s.r, Metadata = mdat, Host_Species = "flava-leuco", 
                            Iterations =  sum(mdat$Host_Spp %in% "flava-leuco"))
set.seed(111)
l <- random_avg_microbiome(Data = sar18s.r, Metadata = mdat, Host_Species = "leucophylla", 
                           Iterations =  sum(mdat$Host_Spp %in% "flava-leuco"))

plotVenn(list("Flava" = f, "Flava-Leucophylla"=fl, "Leucophylla"=l),
         nCycles = 7000, showPlot = T, setColors = c("blue", "forestgreen", "gold"))

#Fig. 4D Venn - 16s; flava, flava-rosea, rosea only####
set.seed(111)
f <- random_avg_microbiome(Data = sar16s.r, Metadata = mdat, Host_Species = "flava", 
                           Iterations =  sum(mdat$Host_Spp %in% "flava-rosea"))
set.seed(111)
fr <- random_avg_microbiome(Data = sar16s.r, Metadata = mdat, Host_Species = "flava-rosea", 
                            Iterations =  sum(mdat$Host_Spp %in% "flava-rosea"))
set.seed(111)
r <- random_avg_microbiome(Data = sar16s.r, Metadata = mdat, Host_Species = "rosea", 
                           Iterations =  sum(mdat$Host_Spp %in% "flava-rosea"))

plotVenn(list("Flava" = f, "Flava-Rosea"=fr, "Rosea"=r),
         nCycles = 7000, showPlot = T, setColors = c("blue", "purple", "red"))

#Fig. 4J Venn - 18s; flava, flava-rosea, rosea only####
set.seed(111)
f <- random_avg_microbiome(Data = sar18s.r, Metadata = mdat, Host_Species = "flava", 
                           Iterations =  sum(mdat$Host_Spp %in% "flava-rosea"))
set.seed(111)
fr <- random_avg_microbiome(Data = sar18s.r, Metadata = mdat, Host_Species = "flava-rosea", 
                            Iterations =  sum(mdat$Host_Spp %in% "flava-rosea"))
set.seed(111)
r <- random_avg_microbiome(Data = sar18s.r, Metadata = mdat, Host_Species = "rosea", 
                           Iterations =  sum(mdat$Host_Spp %in% "flava-rosea"))

plotVenn(list("Flava" = f, "Flava-Rosea"=fr, "Rosea"=r),
         nCycles = 7000, showPlot = T, setColors = c("blue", "purple", "red"))

#Fig. 4E Venn - 16s; leucophylla, leuco-rosea, rosea only####
set.seed(111)
r <- random_avg_microbiome(Data = sar16s.r, Metadata = mdat, Host_Species = "rosea", 
                           Iterations =  sum(mdat$Host_Spp %in% "leuco-rosea"))
set.seed(111)
lr <- random_avg_microbiome(Data = sar16s.r, Metadata = mdat, Host_Species = "leuco-rosea", 
                            Iterations =  sum(mdat$Host_Spp %in% "leuco-rosea"))
set.seed(111)
l <- random_avg_microbiome(Data = sar16s.r, Metadata = mdat, Host_Species = "leucophylla", 
                           Iterations =  sum(mdat$Host_Spp %in% "leuco-rosea"))

plotVenn(list("Leucophylla"=l, "Leucophylla-Rosea"=lr, "Rosea"=r),
         nCycles = 7000, showPlot = T, setColors = c("gold", "orange", "red"))

#Fig. 4K Venn - 18s; leucophylla, leuco-rosea, rosea only####
set.seed(111)
r <- random_avg_microbiome(Data = sar18s.r, Metadata = mdat, Host_Species = "rosea", 
                           Iterations =  sum(mdat$Host_Spp %in% "leuco-rosea"))
set.seed(111)
lr <- random_avg_microbiome(Data = sar18s.r, Metadata = mdat, Host_Species = "leuco-rosea", 
                            Iterations =  sum(mdat$Host_Spp %in% "leuco-rosea"))
set.seed(111)
l <- random_avg_microbiome(Data = sar18s.r, Metadata = mdat, Host_Species = "leucophylla", 
                           Iterations =  sum(mdat$Host_Spp %in% "leuco-rosea"))

plotVenn(list("Leucophylla"=l, "Leucophylla-Rosea"=lr, "Rosea"=r),
         nCycles = 7000, showPlot = T, setColors = c("gold", "orange", "red"))

### WorldClim Variables for Sites ####
clim <- read.csv("climatedata.csv", row.names = 1)

# Plot data
mclim <- melt(clim,id.vars=c("Latitude","Longitude"))

lat.plot <- ggplot(mclim, aes(x=Latitude, y=value, colour=variable, group=variable )) + 
  geom_point()
lat.plot + facet_grid(rows = vars(variable), scales = "free")

long.plot <- ggplot(mclim, aes(x=Longitude, y=value, colour=variable, group=variable )) + 
  geom_point()
long.plot + facet_grid(rows = vars(variable), scales = "free")

# Linear Models of WorldClim variables with lat/long ####
lm1 <- lm(Latitude ~ Annual.Mean.Temperature, data = clim)
summary(lm1)

lm2 <- lm(Longitude ~ Annual.Precipitation, data = clim)
summary(lm2)

